import re
from bs4 import BeautifulSoup
import jieba
import nltk
from nltk.corpus import stopwords

# Ensure the NLTK stop-word corpus is available at import time.
# quiet=True suppresses the progress/"already up-to-date" chatter that
# nltk.download otherwise prints on every import of this module.
nltk.download('stopwords', quiet=True)

# Module-level cache: the combined stop-word set is built once and reused.
# stopwords.words() re-reads corpus files from disk on every call, so
# rebuilding the set per email was needless repeated I/O.
_STOP_WORDS = None


def _get_stop_words():
    """Return the combined Chinese + English stop-word set, built lazily."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(
            stopwords.words('chinese') + stopwords.words('english')
        )
    return _STOP_WORDS


def preprocess_email(content):
    """
    Preprocess raw email content for downstream text analysis.

    Steps: strip HTML tags, keep only Chinese characters and ASCII
    letters, segment with jieba, remove Chinese/English stop words,
    and re-join the surviving tokens with single spaces.

    Args:
        content: Raw email body, possibly containing HTML markup.
            Falsy input (None or empty string) yields ''.

    Returns:
        A space-separated string of the remaining tokens.
    """
    # Guard: BeautifulSoup raises TypeError on None, and empty input
    # can produce no tokens anyway.
    if not content:
        return ''

    # 1. Strip HTML markup, keeping only the visible text.
    text = BeautifulSoup(content, 'html.parser').get_text()

    # 2. Keep only CJK unified ideographs (\u4e00-\u9fa5) and ASCII
    #    letters; digits, punctuation and symbols become spaces so they
    #    act as token separators.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', ' ', text)

    # 3. Segment into words (jieba handles mixed Chinese/English text).
    words = jieba.cut(text)

    # 4. Drop stop words and whitespace-only tokens.
    stop_words = _get_stop_words()
    tokens = [w for w in words if w not in stop_words and w.strip()]

    # 5. Re-join into a single space-separated string.
    return ' '.join(tokens)